using Tables, Plots, CSV, DataFrames, GLM, Statistics

#const datapath = "G:/My Drive/4_clean"
const datapath = "G:/.shortcut-targets-by-id/13aZKIXEbgJup9gg08y-2EkiYw5XHdk6c/Platform-Externalities/4_clean" #"G:/My Drive/4_clean"
const figurepath = "C:/Users/akapor/Documents/GitHub/Platform-Externalities/5_paper/Round 2/Paper/figures/mcmc"
const tablepath = "C:/Users/akapor/Documents/GitHub/Platform-Externalities/5_paper/Round 2/Paper/tables"
const outputpath = "$(datapath)/from_Julia/chain2"

#df_main = CSV.read(outputpath*"/counterfactuals_big.csv",DataFrame)
#df_extra = CSV.read(outputpath*"/counterfactuals_additional_big.csv",DataFrame)
dfs_fit = Dict(year=> CSV.read(outputpath*"/simulate$(year)_big.csv",DataFrame) for year in 2010:2012)
dfs_covariates_old = Dict(year=> CSV.read(outputpath*"/covariates$(year).csv",DataFrame) for year in 2010:2012)
dfs_programs = Dict(year=> CSV.read(outputpath*"/programs$(year).csv",DataFrame) for year in 2010:2012)
dfs_data = Dict(year=> CSV.read(outputpath*"/dataoutcomes$(year).csv",DataFrame) for year in 2010:2012)
dfs_covariates = Dict(year=> CSV.read(datapath*"/students_julia_$(year).csv",DataFrame) for year in 2010:2012)

function runregressions(df)
    r1 = lm(@formula(enrollG25 ~ mate + leng + nem + is2010 + is2012),df)
    r2 = lm(@formula(graduateG25 ~ mate + leng + nem + is2010 + is2012),df)
    r2a = lm(@formula(graduateG25 ~ mate + leng + nem + is2010 + is2012),df[df.enrollG25 .> 0, :])
    r3 = lm(@formula(enrollAny ~ mate + leng + nem + is2010 + is2012),df)
    r4 = lm(@formula(graduate ~ mate + leng + nem + is2010 + is2012),df)
    r4a = lm(@formula(graduate ~ mate + leng + nem + is2010 + is2012),df[df.enrollAny .> 0, :])
    return (enrollG25=r1, graduateG25=r2, graduate!enrollG25=r2a, enroll=r3, graduate=r4, graduate!enroll=r4a)
end

ndraw = maximum(dfs_fit[2012].draw)
results_data = []
results_sim = []
for draw=0:ndraw
    println("draw $draw")
    for sim in (draw > 0 ? ["a","b"] : ["",])
        df_yrs = []
        for year in 2010:2012
            if draw > 0 #simulation draw
                mykeys = [:ind;:type;[Symbol("simulate$(year)$(sim)_$x") for x in ["placement","enroll","graduate","welfare"]]]
                df = dfs_fit[year][dfs_fit[year].draw .== draw, mykeys]
                df.year = [year for t=1:nrow(df)]
                rename!(df,Dict(Symbol("simulate$(year)$(sim)_$x")=>Symbol(x) for x in ["placement","enroll","graduate","welfare"] ))
            else #data
                df = copy(dfs_data[year])
                rename!(df,Dict(:place=>:placement))
                df.type = dfs_covariates_old[year].type
            end
            #merge to covariates -- need GPA!
            df.mate = (dfs_covariates[year].mate .- 500)./110
            df.leng = (dfs_covariates[year].leng .- 500)./110
            df.nem = (dfs_covariates[year].nem .- 500)./110
            df.type2 = df.type.==2
            df.type3 = df.type.==3
            df.type4 = df.type.==4
            # construct G25 indicators
            df.admitG25 = 1.0 .* [(x==0 ? 0 : !(dfs_programs[year].G8[x])) for x in df.placement]
            df.enrollG25 = 1.0 .* [(x==0 ? 0 : !(dfs_programs[year].G8[x])) for x in df.enroll]
            df.graduateG25 = df.enrollG25 .* df.graduate
            #
            df.admitAny = df.placement .> 0
            df.enrollAny = df.enroll .> 0
            push!(df_yrs,df)
        end
        df = vcat(df_yrs...)
        df.is2010 = 1.0 .* (df.year .== 2010)
        df.is2012 = 1.0 .* (df.year .== 2012)
        results = Dict(0=>runregressions(df))
        for type=1:4
            push!(results,type=>runregressions(df[df.type.==type,:]))
        end
        if draw==0
            push!(results_data,results)
        else
            push!(results_sim,results)
        end
    end
end

coef_model(type,x) = mean(coef(getfield(r[type],x)) for r in results_sim)
coef_data(type,x) = mean(coef(getfield(r[type],x)) for r in results_data)

sd_model(type,x) = std([coef(getfield(r[type],x)) for r in results_sim])
sd_data(type,x) = stderror(getfield(results_data[1][type],x))

N_model(type,x) = Int(round(mean(nobs(getfield(r[type],x)) for r in results_sim)))
N_data(type,x) = Int(round(mean(nobs(getfield(r[type],x)) for r in results_data)))


function makeEventStudyTable(out, vars=[:enroll,:graduate,:graduate!enroll]; tag = "G33",
        typenames = ["All", "Male Private", "Male Public", "Female Private", "Female Public"])
    rownames = ["Constant", "Math", "Language", "GPA", "2010", "2012"]
    println(out,"\\begin{tabular}{lcccccc}") #var, enrollG25(data), graduateG25(data), enrollG33(data), graduateG33(data), ""model
    println(out,"\\toprule")
    println(out,repeat(" & \\multicolumn{1}{c}{Data} & \\multicolumn{1}{c}{Model} ",length(vars))*"\\\\")
    colnames = ["\\multicolumn{1}{c}{$x}" for x in repeat(["Enroll $tag", "Graduate $tag", "Graduate \$\\vert\$ Enroll $tag"],inner=2)]
    println(out,"& "*join(colnames," & ")*" \\\\")
    for (_typ,typename) in enumerate(typenames)
        typ = _typ-1
        println(out,"\\midrule")
        println(out,"\\multicolumn{7}{c}{$typename} \\\\")
        mycoefs = zeros(length(rownames),0)
        mysd = zeros(length(rownames),0)
        for vv in vars
            mycoefs = [mycoefs round.(coef_data(typ,vv),digits=3) round.(coef_model(typ,vv),digits=3)]
            mysd = [mysd round.(sd_data(typ,vv),digits=4) round.(sd_model(typ,vv),digits=4)]
        end
        # mycoefs = round.([coef_data(typ,vars[1]) coef_data(typ,vars[2]) coef_data(typ,vars[3]) coef_model(typ,vars[1]) coef_model(typ,vars[2]) coef_model(typ,vars[3])],digits=3)
        # mysd = round.([sd_data(typ,vars[1]) sd_data(typ,vars[2]) sd_data(typ,vars[3]) sd_model(typ,vars[1]) sd_model(typ,vars[2]) sd_model(typ,vars[3])],digits=4)
        for (r,rowname) in enumerate(rownames)
            println(out,"$rowname & $(join(string.(mycoefs[r,:])," & "))"*" \\\\")
            sd_tmp = "("*join(string.(mysd[r,:]),") & (")*")"
            println(out, " & $sd_tmp \\\\")
        end
        println(out,"\\hline")
        println(out,"Obs. & $(N_data(typ,vars[1])) & & $(N_data(typ,vars[2])) & & $(N_data(typ,vars[3])) & \\\\")
    end
    println(out,"\\hline \\hline")
    println(out,"\\end{tabular}")
    #println(out,"\\\\ \\footnotetext{Note: this table shows estimates of each outcome, for each type of student, for the years 2010-2012.  The base year is 2011.}")
end

open(tablepath*"/eventStudyWithinModel.tex","w") do f
    makeEventStudyTable(f)
end

open(tablepath*"/eventStudyWithinModelSmall.tex","w") do f
    makeEventStudyTable(f,typenames=["All",])
end

open(tablepath*"/eventStudyWithinModelG25only.tex","w") do f
    makeEventStudyTable(f,[:enrollG25,:graduateG25,:graduate!enrollG25],tag="G25")
end





#======
describe options for ref response
=#
options = CSV.read(datapath*"/options_julia_2010-12.csv",DataFrame)

options[options.program.==250, [:MajorName, :Univ, :proceso]]

weights_cols = [:x_b_nem, :x_b_psu_leng, :x_b_psu_mate, :x_b_psu_hria, :x_b_psu_cien]
_options_grouped = groupby(options,:program)

df_result = copy(options[1:2,:],copycols=true)
for grp in _options_grouped
    if sort(grp.proceso) == [2010,2011,2012]
        append!(df_result,grp)
    end
end
delete!(df_result,1:2)
options_grouped = groupby(df_result,:program)
# options_grouped = _options_grouped

og_201011 = groupby(df_result[(df_result.proceso .== 2010) .| (df_result.proceso.==2011),:],:program)
og_201112 = groupby(df_result[(df_result.proceso .== 2011) .| (df_result.proceso.==2012),:],:program)

no_change = [ all(length( unique(group[!,col])) == 1 for col in weights_cols) for group in options_grouped]
nc1 = [ all(length( unique(group[!,col])) == 1 for col in weights_cols) for group in og_201011]
nc2 = [ all(length( unique(group[!,col])) == 1 for col in weights_cols) for group in og_201112]

println("means: $(1-mean(no_change)), $(1-mean(nc1)), $(1-mean(nc2))")

#options_grouped[4][!,[:MajorName, :Univ, :proceso, weights_cols...]]

#indexvariation = combine(options_grouped,  [vv => (x -> [extrema(x)]) => [Symbol("min_$vv"), Symbol("max_$vv")] for vv in weights_cols])
indexvariation = combine(options_grouped,  [vv => (x -> extrema(x)[2]-extrema(x)[1]) => Symbol(vv) for vv in weights_cols])

[mean(indexvariation[!,vv].>5) for vv in weights_cols] #for any element, no more than 7\%

mean( any(Array(indexvariation[!,weights_cols]) .> 0,dims=2) ) #24.8%
mean( any(Array(indexvariation[!,weights_cols]) .> 5,dims=2) ) #17.7%%
mean( any(Array(indexvariation[!,weights_cols]) .> 10,dims=2) ) #5.38%
